Data Transformation¶

Task: Convert given dataset to desired format using Melting¶

In [90]:
import pandas as pd
In [192]:
df = pd.read_csv('billings_europe.csv', index_col=0, skiprows = 4, header=[0])
C:\Users\DELL\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3165: DtypeWarning: Columns (2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260) have mixed types.Specify dtype option on import or set low_memory=False.
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
In [193]:
df
Out[193]:
Unnamed: 1 Europe - On Trailing EY Consumer Discretionary - On Trailing EY Consumer Staples - On Trailing EY Energy - On Trailing EY Financials - On Trailing EY Health Care - On Trailing EY Industrials - On Trailing EY Information Technology - On Trailing EY Materials - On Trailing EY ... Greece - On Trailing BVY Ireland - On Trailing BVY Italy - On Trailing BVY Netherlands - On Trailing BVY Norway - On Trailing BVY Portugal - On Trailing BVY Spain - On Trailing BVY Sweden - On Trailing BVY Switzerland - On Trailing BVY United Kingdom - On Trailing BVY
NaN NaN Market Sectors NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
NaN NaN EURO EURO_25 EURO_30 EURO_10 EURO_40 EURO_35 EURO_20 EURO_45 EURO_15 ... GR IE IT NL NO PT ES SE CH GB
31-Dec-74 NaN 27536 26861 25887.8 51400.2 28288 10853.8 21598.2 13201.2 19206.8 ... NaN NaN 25640.4 21079.6 13796.8 NaN 21978 16744.6 24155.2 14720.6
1-Jan-75 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2-Jan-75 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
28-Aug-17 NaN 19540.2 23407.4 12484.4 32322.0 13542.0 14018.6 11966.4 7833.2 15034.0 ... NaN 22085.2 18572.6 32965.2 27484.4 NaN 33455.0 18183.6 42591.2 29264.6
29-Aug-17 NaN 19337.8 23310.8 12297.2 32124.6 13323.0 13779.2 11956.4 8161.8 14934.2 ... NaN 22369.8 18520.8 33006.8 27707.8 NaN 34186.8 17974.0 42936.4 28631.0
30-Aug-17 NaN 19683.4 23461.4 12227.6 31858.2 13161.2 13894.2 11882.6 8057.4 14773.6 ... NaN 22179.2 18546.0 32567.8 27414.6 NaN 33779.0 18208.2 42681.2 28686.0
31-Aug-17 NaN 19514.2 23859.8 12076.2 31062.6 13425.6 13968.8 12444.4 7895.2 20041.2 ... NaN 21422.4 19010.4 32488.6 26173.2 NaN 35239.2 19611.0 42931.4 28286.2
1-Sep-17 NaN 19482.2 23866.8 12058.2 44960.4 13429.4 13774.2 11782.2 8763.0 19971.6 ... NaN 21327.2 18827.4 32908.8 25365.8 NaN 35336.0 19979.0 43008.4 29284.2

11136 rows × 260 columns

In [196]:
del df['Unnamed: 1']
In [197]:
df
Out[197]:
Europe - On Trailing EY Consumer Discretionary - On Trailing EY Consumer Staples - On Trailing EY Energy - On Trailing EY Financials - On Trailing EY Health Care - On Trailing EY Industrials - On Trailing EY Information Technology - On Trailing EY Materials - On Trailing EY Telecommunication Services - On Trailing EY ... Greece - On Trailing BVY Ireland - On Trailing BVY Italy - On Trailing BVY Netherlands - On Trailing BVY Norway - On Trailing BVY Portugal - On Trailing BVY Spain - On Trailing BVY Sweden - On Trailing BVY Switzerland - On Trailing BVY United Kingdom - On Trailing BVY
NaN Market Sectors NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
NaN EURO EURO_25 EURO_30 EURO_10 EURO_40 EURO_35 EURO_20 EURO_45 EURO_15 EURO_50 ... GR IE IT NL NO PT ES SE CH GB
31-Dec-74 27536 26861 25887.8 51400.2 28288 10853.8 21598.2 13201.2 19206.8 17942 ... NaN NaN 25640.4 21079.6 13796.8 NaN 21978 16744.6 24155.2 14720.6
1-Jan-75 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2-Jan-75 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
28-Aug-17 19540.2 23407.4 12484.4 32322.0 13542.0 14018.6 11966.4 7833.2 15034.0 34851.2 ... NaN 22085.2 18572.6 32965.2 27484.4 NaN 33455.0 18183.6 42591.2 29264.6
29-Aug-17 19337.8 23310.8 12297.2 32124.6 13323.0 13779.2 11956.4 8161.8 14934.2 35208.6 ... NaN 22369.8 18520.8 33006.8 27707.8 NaN 34186.8 17974.0 42936.4 28631.0
30-Aug-17 19683.4 23461.4 12227.6 31858.2 13161.2 13894.2 11882.6 8057.4 14773.6 35383.0 ... NaN 22179.2 18546.0 32567.8 27414.6 NaN 33779.0 18208.2 42681.2 28686.0
31-Aug-17 19514.2 23859.8 12076.2 31062.6 13425.6 13968.8 12444.4 7895.2 20041.2 30423.2 ... NaN 21422.4 19010.4 32488.6 26173.2 NaN 35239.2 19611.0 42931.4 28286.2
1-Sep-17 19482.2 23866.8 12058.2 44960.4 13429.4 13774.2 11782.2 8763.0 19971.6 30479.0 ... NaN 21327.2 18827.4 32908.8 25365.8 NaN 35336.0 19979.0 43008.4 29284.2

11136 rows × 259 columns

In [198]:
df.iloc[0] = df.iloc[0].fillna(method='ffill')
In [199]:
df.head()
Out[199]:
Europe - On Trailing EY Consumer Discretionary - On Trailing EY Consumer Staples - On Trailing EY Energy - On Trailing EY Financials - On Trailing EY Health Care - On Trailing EY Industrials - On Trailing EY Information Technology - On Trailing EY Materials - On Trailing EY Telecommunication Services - On Trailing EY ... Greece - On Trailing BVY Ireland - On Trailing BVY Italy - On Trailing BVY Netherlands - On Trailing BVY Norway - On Trailing BVY Portugal - On Trailing BVY Spain - On Trailing BVY Sweden - On Trailing BVY Switzerland - On Trailing BVY United Kingdom - On Trailing BVY
NaN Market Sectors Sectors Sectors Sectors Sectors Sectors Sectors Sectors Sectors ... Countries Countries Countries Countries Countries Countries Countries Countries Countries Countries
NaN EURO EURO_25 EURO_30 EURO_10 EURO_40 EURO_35 EURO_20 EURO_45 EURO_15 EURO_50 ... GR IE IT NL NO PT ES SE CH GB
31-Dec-74 27536 26861 25887.8 51400.2 28288 10853.8 21598.2 13201.2 19206.8 17942 ... NaN NaN 25640.4 21079.6 13796.8 NaN 21978 16744.6 24155.2 14720.6
1-Jan-75 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2-Jan-75 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 259 columns

In [205]:
df1 = df.transpose()
In [206]:
df1
Out[206]:
NaN NaN 31-Dec-74 1-Jan-75 2-Jan-75 3-Jan-75 6-Jan-75 7-Jan-75 8-Jan-75 9-Jan-75 ... 21-Aug-17 22-Aug-17 23-Aug-17 24-Aug-17 25-Aug-17 28-Aug-17 29-Aug-17 30-Aug-17 31-Aug-17 1-Sep-17
Europe - On Trailing EY Market EURO 27536 NaN NaN NaN NaN NaN NaN NaN ... 20038.0 19947.4 19945.2 19774.4 NaN 19540.2 19337.8 19683.4 19514.2 19482.2
Consumer Discretionary - On Trailing EY Sectors EURO_25 26861 NaN NaN NaN NaN NaN NaN NaN ... 23846.6 24013.4 23757.4 23981.8 NaN 23407.4 23310.8 23461.4 23859.8 23866.8
Consumer Staples - On Trailing EY Sectors EURO_30 25887.8 NaN NaN NaN NaN NaN NaN NaN ... 12138.6 11894.6 11953.0 11968.6 NaN 12484.4 12297.2 12227.6 12076.2 12058.2
Energy - On Trailing EY Sectors EURO_10 51400.2 NaN NaN NaN NaN NaN NaN NaN ... 32243.2 32511.8 32396.4 32150.4 NaN 32322.0 32124.6 31858.2 31062.6 44960.4
Financials - On Trailing EY Sectors EURO_40 28288 NaN NaN NaN NaN NaN NaN NaN ... 13012.6 13102.4 13041.8 13201.6 NaN 13542.0 13323.0 13161.2 13425.6 13429.4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Portugal - On Trailing BVY Countries PT NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Spain - On Trailing BVY Countries ES 21978 NaN NaN NaN NaN NaN NaN NaN ... 33697.2 33257.4 33657.2 33331.0 NaN 33455.0 34186.8 33779.0 35239.2 35336.0
Sweden - On Trailing BVY Countries SE 16744.6 NaN NaN NaN NaN NaN NaN NaN ... 18404.6 18524.6 18403.4 18419.8 NaN 18183.6 17974.0 18208.2 19611.0 19979.0
Switzerland - On Trailing BVY Countries CH 24155.2 NaN NaN NaN NaN NaN NaN NaN ... 42544.6 42477.0 42683.2 42676.4 NaN 42591.2 42936.4 42681.2 42931.4 43008.4
United Kingdom - On Trailing BVY Countries GB 14720.6 NaN NaN NaN NaN NaN NaN NaN ... 31602.4 31010.0 30965.8 29532.8 NaN 29264.6 28631.0 28686.0 28286.2 29284.2

259 rows × 11136 columns

In [208]:
s = pd.Series(df1.columns)
In [210]:
s = s.fillna('unnamed:' + (s.groupby(s.isnull()).cumcount() + 1).astype(str))
In [216]:
df1.columns = s
df1 = df1.reset_index()
In [217]:
df1.head()
Out[217]:
index unnamed:1 unnamed:2 31-Dec-74 1-Jan-75 2-Jan-75 3-Jan-75 6-Jan-75 7-Jan-75 8-Jan-75 ... 21-Aug-17 22-Aug-17 23-Aug-17 24-Aug-17 25-Aug-17 28-Aug-17 29-Aug-17 30-Aug-17 31-Aug-17 1-Sep-17
0 Europe - On Trailing EY Market EURO 27536 NaN NaN NaN NaN NaN NaN ... 20038.0 19947.4 19945.2 19774.4 NaN 19540.2 19337.8 19683.4 19514.2 19482.2
1 Consumer Discretionary - On Trailing EY Sectors EURO_25 26861 NaN NaN NaN NaN NaN NaN ... 23846.6 24013.4 23757.4 23981.8 NaN 23407.4 23310.8 23461.4 23859.8 23866.8
2 Consumer Staples - On Trailing EY Sectors EURO_30 25887.8 NaN NaN NaN NaN NaN NaN ... 12138.6 11894.6 11953.0 11968.6 NaN 12484.4 12297.2 12227.6 12076.2 12058.2
3 Energy - On Trailing EY Sectors EURO_10 51400.2 NaN NaN NaN NaN NaN NaN ... 32243.2 32511.8 32396.4 32150.4 NaN 32322.0 32124.6 31858.2 31062.6 44960.4
4 Financials - On Trailing EY Sectors EURO_40 28288 NaN NaN NaN NaN NaN NaN ... 13012.6 13102.4 13041.8 13201.6 NaN 13542.0 13323.0 13161.2 13425.6 13429.4

5 rows × 11137 columns

In [218]:
melted = pd.melt(df1, id_vars=['index','unnamed:1','unnamed:2'])
In [219]:
melted.head(10)
Out[219]:
index unnamed:1 unnamed:2 variable value
0 Europe - On Trailing EY Market EURO 31-Dec-74 27536
1 Consumer Discretionary - On Trailing EY Sectors EURO_25 31-Dec-74 26861
2 Consumer Staples - On Trailing EY Sectors EURO_30 31-Dec-74 25887.8
3 Energy - On Trailing EY Sectors EURO_10 31-Dec-74 51400.2
4 Financials - On Trailing EY Sectors EURO_40 31-Dec-74 28288
5 Health Care - On Trailing EY Sectors EURO_35 31-Dec-74 10853.8
6 Industrials - On Trailing EY Sectors EURO_20 31-Dec-74 21598.2
7 Information Technology - On Trailing EY Sectors EURO_45 31-Dec-74 13201.2
8 Materials - On Trailing EY Sectors EURO_15 31-Dec-74 19206.8
9 Telecommunication Services - On Trailing EY Sectors EURO_50 31-Dec-74 17942
In [220]:
melted[['Segment','Period']] = melted['index'].str.split('-', expand=True)
In [221]:
melted.head()
Out[221]:
index unnamed:1 unnamed:2 variable value Segment Period
0 Europe - On Trailing EY Market EURO 31-Dec-74 27536 Europe On Trailing EY
1 Consumer Discretionary - On Trailing EY Sectors EURO_25 31-Dec-74 26861 Consumer Discretionary On Trailing EY
2 Consumer Staples - On Trailing EY Sectors EURO_30 31-Dec-74 25887.8 Consumer Staples On Trailing EY
3 Energy - On Trailing EY Sectors EURO_10 31-Dec-74 51400.2 Energy On Trailing EY
4 Financials - On Trailing EY Sectors EURO_40 31-Dec-74 28288 Financials On Trailing EY
In [222]:
del melted['index']
In [223]:
melted.head()
Out[223]:
unnamed:1 unnamed:2 variable value Segment Period
0 Market EURO 31-Dec-74 27536 Europe On Trailing EY
1 Sectors EURO_25 31-Dec-74 26861 Consumer Discretionary On Trailing EY
2 Sectors EURO_30 31-Dec-74 25887.8 Consumer Staples On Trailing EY
3 Sectors EURO_10 31-Dec-74 51400.2 Energy On Trailing EY
4 Sectors EURO_40 31-Dec-74 28288 Financials On Trailing EY
In [227]:
melted = melted.reset_index()
In [228]:
melted.head()
Out[228]:
index unnamed:1 unnamed:2 variable value Segment Period
0 0 Market EURO 31-Dec-74 27536 Europe On Trailing EY
1 1 Sectors EURO_25 31-Dec-74 26861 Consumer Discretionary On Trailing EY
2 2 Sectors EURO_30 31-Dec-74 25887.8 Consumer Staples On Trailing EY
3 3 Sectors EURO_10 31-Dec-74 51400.2 Energy On Trailing EY
4 4 Sectors EURO_40 31-Dec-74 28288 Financials On Trailing EY
In [229]:
del melted['index']
In [230]:
melted.rename(columns = {'unnamed:1':'Type', 'unnamed:2':'Subtype', 'variable':'Date'
                         , 'Segment':'Segment', 'Period':'Period','value':'Value'}, inplace = True)
In [231]:
melted.head()
Out[231]:
Type Subtype Date Value Segment Period
0 Market EURO 31-Dec-74 27536 Europe On Trailing EY
1 Sectors EURO_25 31-Dec-74 26861 Consumer Discretionary On Trailing EY
2 Sectors EURO_30 31-Dec-74 25887.8 Consumer Staples On Trailing EY
3 Sectors EURO_10 31-Dec-74 51400.2 Energy On Trailing EY
4 Sectors EURO_40 31-Dec-74 28288 Financials On Trailing EY
In [232]:
melted_sorted = melted[['Date', 'Segment', 'Period', 'Type', 'Subtype','Value']]
In [233]:
melted_sorted.head()
Out[233]:
Date Segment Period Type Subtype Value
0 31-Dec-74 Europe On Trailing EY Market EURO 27536
1 31-Dec-74 Consumer Discretionary On Trailing EY Sectors EURO_25 26861
2 31-Dec-74 Consumer Staples On Trailing EY Sectors EURO_30 25887.8
3 31-Dec-74 Energy On Trailing EY Sectors EURO_10 51400.2
4 31-Dec-74 Financials On Trailing EY Sectors EURO_40 28288
In [242]:
# Checking for missing values
percent_missing = melted_sorted.isnull().sum() * 100 / len(melted_sorted) 
percent_missing = round(percent_missing, 2)
percent_missing = percent_missing.astype(str) + '%'
percent_missing
Out[242]:
Date       0.0%
Segment    0.0%
Period     0.0%
Type       0.0%
Subtype    0.0%
Value      0.0%
dtype: object
In [241]:
melted_sorted = melted_sorted.dropna()
In [247]:
# Checking for duplicate rows in the dataset
dup_percentage = melted_sorted.duplicated().sum()/len(melted_sorted)*100
dup_percentage = round(dup_percentage,2)
dup_percentage = dup_percentage.astype(str) + '%'
dup_percentage
Out[247]:
'0.0%'
In [273]:
melted_sorted['Value'] = melted_sorted["Value"].astype("float")
In [280]:
melted_sorted["Date"] = melted_sorted["Date"].astype('datetime64[ns]')
In [274]:
df_country = melted_sorted[melted_sorted['Type'] == 'Countries'] 
In [275]:
df_country = df_country.reset_index()
In [276]:
del df_country['index']
In [277]:
df_country.head()
Out[277]:
Date Segment Period Type Subtype Value
0 31-Dec-74 Austria On Trailing EY Countries AT 10148.6
1 31-Dec-74 Belgium On Trailing EY Countries BE 7918.6
2 31-Dec-74 Denmark On Trailing EY Countries DK 17596.4
3 31-Dec-74 France On Trailing EY Countries FR 24993.4
4 31-Dec-74 Germany On Trailing EY Countries DE 13451.6
In [278]:
df_country.dtypes
Out[278]:
Date        object
Segment     object
Period      object
Type        object
Subtype     object
Value      float64
dtype: object
In [279]:
df3 = df_country.groupby('Segment').sum()
df3
Out[279]:
Value
Segment
Austria 74103620.8
Belgium 87917478.2
Denmark 128206611.6
Finland 60853596.0
France 102490130.8
Germany 106702182.0
Greece 14181895.0
Ireland 81519071.0
Italy 116358047.2
Netherlands 104444243.0
Norway 109394855.6
Portugal 19975413.4
Spain 98959061.0
Sweden 79518933.2
Switzerland 104617768.2
United Kingdom 93540781.8
In [293]:
melted_sorted['Date'] >= '2016-01-01'
Out[293]:
0          False
1          False
2          False
3          False
4          False
           ...  
2883700     True
2883702     True
2883703     True
2883704     True
2883705     True
Name: Date, Length: 240922, dtype: bool
In [292]:
#df_period = melted_sorted[melted_sorted['Type'] == 'Market' & melted_sorted['Date'] >= '2016-01-01'] 
In [ ]: